DataViz

Data Visualization

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.2.0
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(esquisse)
library(lubridate)
library(RColorBrewer)

Let’s load our dataset.

bike = read_csv("dc_bike.csv")
Rows: 17379 Columns: 15
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (6): dteday, mnth, holiday, weekday, workingday, weathersit
dbl (9): instant, hr, temp, atemp, hum, windspeed, casual, registered, count

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(bike)
spc_tbl_ [17,379 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ instant   : num [1:17379] 1 2 3 4 5 6 7 8 9 10 ...
 $ dteday    : chr [1:17379] "1/1/2011" "1/1/2011" "1/1/2011" "1/1/2011" ...
 $ mnth      : chr [1:17379] "Jan" "Jan" "Jan" "Jan" ...
 $ hr        : num [1:17379] 0 1 2 3 4 5 6 7 8 9 ...
 $ holiday   : chr [1:17379] "NotHoliday" "NotHoliday" "NotHoliday" "NotHoliday" ...
 $ weekday   : chr [1:17379] "Saturday" "Saturday" "Saturday" "Saturday" ...
 $ workingday: chr [1:17379] "NotWorkingDay" "NotWorkingDay" "NotWorkingDay" "NotWorkingDay" ...
 $ weathersit: chr [1:17379] "NoPrecip" "NoPrecip" "NoPrecip" "NoPrecip" ...
 $ temp      : num [1:17379] 0.24 0.22 0.22 0.24 0.24 0.24 0.22 0.2 0.24 0.32 ...
 $ atemp     : num [1:17379] 0.288 0.273 0.273 0.288 0.288 ...
 $ hum       : num [1:17379] 0.81 0.8 0.8 0.75 0.75 0.75 0.8 0.86 0.75 0.76 ...
 $ windspeed : num [1:17379] 0 0 0 0 0 0.0896 0 0 0 0 ...
 $ casual    : num [1:17379] 3 8 5 3 0 0 2 1 1 8 ...
 $ registered: num [1:17379] 13 32 27 10 1 1 0 2 7 6 ...
 $ count     : num [1:17379] 16 40 32 13 1 1 2 3 8 14 ...
 - attr(*, "spec")=
  .. cols(
  ..   instant = col_double(),
  ..   dteday = col_character(),
  ..   mnth = col_character(),
  ..   hr = col_double(),
  ..   holiday = col_character(),
  ..   weekday = col_character(),
  ..   workingday = col_character(),
  ..   weathersit = col_character(),
  ..   temp = col_double(),
  ..   atemp = col_double(),
  ..   hum = col_double(),
  ..   windspeed = col_double(),
  ..   casual = col_double(),
  ..   registered = col_double(),
  ..   count = col_double()
  .. )
 - attr(*, "problems")=<externalptr> 
summary(bike)
    instant         dteday              mnth                 hr       
 Min.   :    1   Length:17379       Length:17379       Min.   : 0.00  
 1st Qu.: 4346   Class :character   Class :character   1st Qu.: 6.00  
 Median : 8690   Mode  :character   Mode  :character   Median :12.00  
 Mean   : 8690                                         Mean   :11.55  
 3rd Qu.:13034                                         3rd Qu.:18.00  
 Max.   :17379                                         Max.   :23.00  
   holiday            weekday           workingday         weathersit       
 Length:17379       Length:17379       Length:17379       Length:17379      
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
      temp           atemp             hum           windspeed     
 Min.   :0.020   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:0.340   1st Qu.:0.3333   1st Qu.:0.4800   1st Qu.:0.1045  
 Median :0.500   Median :0.4848   Median :0.6300   Median :0.1940  
 Mean   :0.497   Mean   :0.4758   Mean   :0.6272   Mean   :0.1901  
 3rd Qu.:0.660   3rd Qu.:0.6212   3rd Qu.:0.7800   3rd Qu.:0.2537  
 Max.   :1.000   Max.   :1.0000   Max.   :1.0000   Max.   :0.8507  
     casual         registered        count      
 Min.   :  0.00   Min.   :  0.0   Min.   :  1.0  
 1st Qu.:  4.00   1st Qu.: 34.0   1st Qu.: 40.0  
 Median : 17.00   Median :115.0   Median :142.0  
 Mean   : 35.68   Mean   :153.8   Mean   :189.5  
 3rd Qu.: 48.00   3rd Qu.:220.0   3rd Qu.:281.0  
 Max.   :367.00   Max.   :886.0   Max.   :977.0  

Let’s do a bit of variable type conversion before we proceed.

bike = bike %>% mutate_if(is.character, as_factor)

Examine structure

str(bike)
tibble [17,379 × 15] (S3: tbl_df/tbl/data.frame)
 $ instant   : num [1:17379] 1 2 3 4 5 6 7 8 9 10 ...
 $ dteday    : Factor w/ 731 levels "1/1/2011","1/2/2011",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ mnth      : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ hr        : num [1:17379] 0 1 2 3 4 5 6 7 8 9 ...
 $ holiday   : Factor w/ 2 levels "NotHoliday","Holiday": 1 1 1 1 1 1 1 1 1 1 ...
 $ weekday   : Factor w/ 7 levels "Saturday","Sunday",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ workingday: Factor w/ 2 levels "NotWorkingDay",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ weathersit: Factor w/ 4 levels "NoPrecip","Misty",..: 1 1 1 1 1 2 1 1 1 1 ...
 $ temp      : num [1:17379] 0.24 0.22 0.22 0.24 0.24 0.24 0.22 0.2 0.24 0.32 ...
 $ atemp     : num [1:17379] 0.288 0.273 0.273 0.288 0.288 ...
 $ hum       : num [1:17379] 0.81 0.8 0.8 0.75 0.75 0.75 0.8 0.86 0.75 0.76 ...
 $ windspeed : num [1:17379] 0 0 0 0 0 0.0896 0 0 0 0 ...
 $ casual    : num [1:17379] 3 8 5 3 0 0 2 1 1 8 ...
 $ registered: num [1:17379] 13 32 27 10 1 1 0 2 7 6 ...
 $ count     : num [1:17379] 16 40 32 13 1 1 2 3 8 14 ...

Also convert the hour variable to a factor and the date variable to a date object.

bike = bike %>% mutate(hr = as_factor(hr)) %>% 
  mutate(dteday = mdy(dteday))

Double-check structure

str(bike)
tibble [17,379 × 15] (S3: tbl_df/tbl/data.frame)
 $ instant   : num [1:17379] 1 2 3 4 5 6 7 8 9 10 ...
 $ dteday    : Date[1:17379], format: "2011-01-01" "2011-01-01" ...
 $ mnth      : Factor w/ 12 levels "Jan","Feb","Mar",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ hr        : Factor w/ 24 levels "0","1","2","3",..: 1 2 3 4 5 6 7 8 9 10 ...
 $ holiday   : Factor w/ 2 levels "NotHoliday","Holiday": 1 1 1 1 1 1 1 1 1 1 ...
 $ weekday   : Factor w/ 7 levels "Saturday","Sunday",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ workingday: Factor w/ 2 levels "NotWorkingDay",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ weathersit: Factor w/ 4 levels "NoPrecip","Misty",..: 1 1 1 1 1 2 1 1 1 1 ...
 $ temp      : num [1:17379] 0.24 0.22 0.22 0.24 0.24 0.24 0.22 0.2 0.24 0.32 ...
 $ atemp     : num [1:17379] 0.288 0.273 0.273 0.288 0.288 ...
 $ hum       : num [1:17379] 0.81 0.8 0.8 0.75 0.75 0.75 0.8 0.86 0.75 0.76 ...
 $ windspeed : num [1:17379] 0 0 0 0 0 0.0896 0 0 0 0 ...
 $ casual    : num [1:17379] 3 8 5 3 0 0 2 1 1 8 ...
 $ registered: num [1:17379] 13 32 27 10 1 1 0 2 7 6 ...
 $ count     : num [1:17379] 16 40 32 13 1 1 2 3 8 14 ...

Now we’re ready build some data visualizations.

Let’s base our visualizations around a series of questions that we want to potentially answer.

How does the total number of rides change by hour of the day?

ggplot(bike,aes(x=hr,y=count)) + geom_boxplot() + theme_bw()

What about by month?

ggplot(bike,aes(x=mnth,y=count)) + geom_boxplot() + theme_bw()

By day of week

ggplot(bike,aes(x=weekday,y=count)) + geom_boxplot() + theme_bw()

Some faceting.

ggplot(bike,aes(x=hr,y=count)) + geom_boxplot() + theme_bw() + facet_wrap(~workingday,ncol =1)

Has the bike share program grown over time? Plot sum of daily rides over time.

bike %>% group_by(dteday) %>% summarise(sum = sum(count)) %>%
ggplot(aes(x=dteday,y=sum)) + geom_line() + theme_bw()

bike %>% group_by(dteday) %>% summarise(sum = sum(count)) %>%
ggplot(aes(x=dteday,y=sum)) + geom_line() + geom_smooth(method = "lm", se = FALSE) + theme_bw()
`geom_smooth()` using formula = 'y ~ x'

Is there a relationship between temperature and ride count?

ggplot(bike,aes(x=temp,y=count)) + geom_point() + theme_bw()

Use alpha to alleviate overplotting.

ggplot(bike,aes(x=temp,y=count)) + geom_point(alpha=0.05) + theme_bw()

Add some color

ggplot(bike,aes(x=temp,y=count,color=workingday)) + geom_point(alpha=0.1) + theme_bw()

Not sure that this works well, let’s facet.

ggplot(bike,aes(x=temp,y=count,color=workingday)) + geom_point(alpha=0.1) + 
  facet_wrap(~workingday) + theme_bw()

Keep playing with this a bit.

ggplot(bike,aes(x=temp,y=count,color=workingday)) + geom_point(alpha=0.1) + 
  facet_wrap(~weathersit) + theme_bw()

Manually adjusting colors

ggplot(bike,aes(x=temp,y=count,color=workingday)) + geom_point(alpha=0.1) + 
  facet_wrap(~workingday) + theme_bw() +
  scale_color_manual(values=c("gray","blue"))

Let’s look at a histogram of temp and see how we can adjust colors of fills.

ggplot(bike,aes(x=temp)) + geom_histogram() + theme_bw()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Let’s look at just three months out of the year: Jan, Mar, and Jul. Use a filter to isolate those.

bike %>% filter(mnth == "Jan" | mnth == "Mar" | mnth == "Jul") %>%
ggplot(aes(x=temp,fill=mnth)) + geom_histogram() + theme_bw()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Adjust the histograms so that they overlap and have some transparency.

bike %>% filter(mnth == "Jan" | mnth == "Mar" | mnth == "Jul") %>%
ggplot(aes(x=temp,fill=mnth)) + 
  geom_histogram(position = "identity", alpha = 0.4) + theme_bw()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Adjust the colors manually.

bike %>% filter(mnth == "Jan" | mnth == "Mar" | mnth == "Jul") %>%
ggplot(aes(x=temp,fill=mnth)) + 
  geom_histogram(position = "identity", alpha = 0.4) + theme_bw() + 
  scale_fill_manual(values=c("blue","green","red"))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

We can also apply palettes. See here for discussion: http://www.sthda.com/english/wiki/ggplot2-colors-how-to-change-colors-automatically-and-manually#use-rcolorbrewer-palettes

bike %>% filter(mnth == "Jan" | mnth == "Mar" | mnth == "Jul") %>%
ggplot(aes(x=temp,fill=mnth)) + 
  geom_histogram(position = "identity", alpha = 0.4) + theme_bw() + 
  scale_fill_brewer(palette="Set2")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Finally, let’s add some labels.

bike %>% filter(mnth == "Jan" | mnth == "Mar" | mnth == "Jul") %>%
ggplot(aes(x=temp,fill=mnth)) + 
  geom_histogram(position = "identity", alpha = 0.4) + theme_bw() + 
  scale_fill_brewer(palette="Set2") +
  xlab("Temperature") + ylab("Count") + ggtitle("Rides by Month by Temperature") +
  labs(fill="Month") 
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Save the plot to file

ggsave("plot.jpg",width=6,height=4)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.